{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### FastText词向量\n",
    "该任务语料库较小，用fastText可以增加n-gram特征，比传统word2vec要好    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from utils import load_curpus"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 对语料库进行清洗和分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n",
      "Loading model cost 0.502 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "data = load_curpus(\"weibo2018/train.txt\") + load_curpus(\"weibo2018/test.txt\")\n",
    "df = pd.DataFrame(data, columns=[\"content\", \"sentiment\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>content</th>\n",
       "      <th>sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[书中, 自有, 黄金屋, 书中, 自有, 颜如玉, 沿着, 岁月, 的, 长河, 跋涉, ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[这是, 英超, 被, 黑, 的, 最惨, 的, 一次, [二哈], [二哈], 十几年来,...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[中国, 远洋, 海运, 集团, 副总经理, 俞曾, 港, 月, 日, 在, 上, 表示, ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[看, 流星花园, 其实, 也, 还好, 啦, 现在, 的, 观念, 以及, 时尚, 眼光,...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[汉武帝, 的, 罪己, 诏, 的, 真实性, 尽管, 存在, 着, 争议, 然而, 轮台,...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             content  sentiment\n",
       "0  [书中, 自有, 黄金屋, 书中, 自有, 颜如玉, 沿着, 岁月, 的, 长河, 跋涉, ...          1\n",
       "1  [这是, 英超, 被, 黑, 的, 最惨, 的, 一次, [二哈], [二哈], 十几年来,...          0\n",
       "2  [中国, 远洋, 海运, 集团, 副总经理, 俞曾, 港, 月, 日, 在, 上, 表示, ...          1\n",
       "3  [看, 流星花园, 其实, 也, 还好, 啦, 现在, 的, 观念, 以及, 时尚, 眼光,...          1\n",
       "4  [汉武帝, 的, 罪己, 诏, 的, 真实性, 尽管, 存在, 着, 争议, 然而, 轮台,...          1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 训练词向量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from gensim.models import FastText\n",
    "model = FastText(df[\"content\"], \n",
    "                 size=100,\n",
    "                 window=5, \n",
    "                 min_count=3, # 只保留出现次数大于3的词语\n",
    "                 iter=1000,  # 10000次训练\n",
    "                 min_n=2,     # 默认为3,因为文本是中文这里改为2\n",
    "                 max_n=4,     # 默认为6,因为文本是中文这里改为5\n",
    "                 word_ngrams=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 词向量效果展示"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/albert/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "/home/albert/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
      "  if np.issubdtype(vec.dtype, np.int):\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('你', 0.8299396634101868),\n",
       " ('他', 0.7914878129959106),\n",
       " ('她', 0.7295881509780884),\n",
       " ('自己', 0.6827410459518433),\n",
       " ('不', 0.649764358997345),\n",
       " ('你们', 0.6482383608818054),\n",
       " ('了', 0.6053299307823181),\n",
       " ('有人', 0.5951108932495117),\n",
       " ('他们', 0.5880480408668518),\n",
       " ('别人', 0.5770147442817688)]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.most_similar(\"我\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/albert/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "/home/albert/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
      "  if np.issubdtype(vec.dtype, np.int):\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('喜欢', 0.5164913535118103),\n",
       " ('小心', 0.49298298358917236),\n",
       " ('幸福', 0.4906578063964844),\n",
       " ('难过', 0.4871729612350464),\n",
       " ('难受', 0.4736190438270569),\n",
       " ('容易', 0.47024640440940857),\n",
       " ('生气', 0.46578270196914673),\n",
       " ('好开心', 0.4513933062553406),\n",
       " ('天天开心', 0.44929811358451843),\n",
       " ('好', 0.4428994655609131)]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.most_similar(\"开心\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/albert/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "/home/albert/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
      "  if np.issubdtype(vec.dtype, np.int):\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('管', 0.33231693506240845),\n",
       " ('tm', 0.3150600790977478),\n",
       " ('生菜', 0.3112187087535858),\n",
       " ('大神', 0.3058115243911743),\n",
       " ('卧', 0.30501842498779297),\n",
       " ('破产', 0.30370914936065674),\n",
       " ('逛逛', 0.30254125595092773),\n",
       " ('好好看', 0.29780805110931396),\n",
       " ('薛之谦', 0.2912497818470001),\n",
       " ('诶', 0.2890874147415161)]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.most_similar(\"卧槽\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/albert/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "/home/albert/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
      "  if np.issubdtype(vec.dtype, np.int):\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('了', 0.45228898525238037),\n",
       " ('[doge]', 0.44459354877471924),\n",
       " ('啊啊啊', 0.39727917313575745),\n",
       " ('[阴险]', 0.39499837160110474),\n",
       " ('[笑cry]', 0.39007097482681274),\n",
       " ('我', 0.3849623203277588),\n",
       " ('[允悲]', 0.3687397241592407),\n",
       " ('[跪了]', 0.3672424852848053),\n",
       " ('[困]', 0.35909155011177063),\n",
       " ('啊', 0.35804903507232666)]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.most_similar(\"[二哈]\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 保存模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.save(\"model/model_100.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
